In [1]:
import numpy as np
import scipy.special
import shap
import sklearn.ensemble
import sklearn.tree
import graphviz
In [2]:
X,y = shap.datasets.boston()
orig_model = sklearn.tree.DecisionTreeRegressor(max_depth=2)
orig_model.fit(X, y)
Out[2]:
In [3]:
dot_data = sklearn.tree.export_graphviz(orig_model, out_file=None, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[3]:
[graphviz rendering of the fitted depth-2 decision tree]
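If you are working outside a notebook, the same visualization can be written to disk instead of displayed inline; a minimal sketch (the filename is our choice):

In [ ]:
# save the rendered tree to a PNG file instead of displaying it inline
graph.render("boston_tree", format="png", cleanup=True)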
In [4]:
# extract the arrays that define the tree
children_left = orig_model.tree_.children_left
children_right = orig_model.tree_.children_right
children_default = children_right.copy() # sklearn has no missing-value support, so the default branch is arbitrary
features = orig_model.tree_.feature
thresholds = orig_model.tree_.threshold
values = orig_model.tree_.value.reshape(orig_model.tree_.value.shape[0], 1)
node_sample_weight = orig_model.tree_.weighted_n_node_samples
print(" children_left", children_left) # note that negative children values mean this is a leaf node
print(" children_right", children_right)
print(" children_default", children_default)
print(" features", features)
print(" thresholds", thresholds.round(3))
print(" values", values.round(3))
print("node_sample_weight", node_sample_weight)
In [5]:
# define a custom tree model
tree_dict = {
    "children_left": children_left,
    "children_right": children_right,
    "children_default": children_default,
    "features": features,
    "thresholds": thresholds,
    "values": values,
    "node_sample_weight": node_sample_weight
}
model = {
    "trees": [tree_dict]
}
In [6]:
explainer = shap.TreeExplainer(model)
In [7]:
# Make sure that the ingested SHAP model (a TreeEnsemble object) makes the
# same predictions as the original model
assert np.abs(explainer.model.predict(X) - orig_model.predict(X)).max() < 1e-4
In [8]:
# make sure the SHAP values sum up to the model output (this is the local accuracy property)
assert np.abs(explainer.expected_value + explainer.shap_values(X).sum(1) - orig_model.predict(X)).max() < 1e-4
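With both checks passing, the ingested model behaves like any other tree model in SHAP, so the usual plotting functions apply; for example (assuming a working matplotlib backend):

In [ ]:
# visualize the attributions of the custom tree model
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X)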
In [9]:
X2,y2 = shap.datasets.adult()
orig_model2 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2)
orig_model2.fit(X2, y2)
Out[9]:
In [10]:
tree_tmp = orig_model2.estimators_[0][0].tree_
# extract the arrays that define the tree
children_left1 = tree_tmp.children_left
children_right1 = tree_tmp.children_right
children_default1 = children_right1.copy() # sklearn has no missing-value support, so the default branch is arbitrary
features1 = tree_tmp.feature
thresholds1 = tree_tmp.threshold
values1 = tree_tmp.value.reshape(tree_tmp.value.shape[0], 1)
node_sample_weight1 = tree_tmp.weighted_n_node_samples
print(" children_left1", children_left1) # note that negative children values mean this is a leaf node
print(" children_right1", children_right1)
print(" children_default1", children_default1)
print(" features1", features1)
print(" thresholds1", thresholds1.round(3))
print(" values1", values1.round(3))
print("node_sample_weight1", node_sample_weight1)
In [11]:
tree_tmp = orig_model2.estimators_[1][0].tree_
# extract the arrays that define the tree
children_left2 = tree_tmp.children_left
children_right2 = tree_tmp.children_right
children_default2 = children_right2.copy() # sklearn has no missing-value support, so the default branch is arbitrary
features2 = tree_tmp.feature
thresholds2 = tree_tmp.threshold
values2 = tree_tmp.value.reshape(tree_tmp.value.shape[0], 1)
node_sample_weight2 = tree_tmp.weighted_n_node_samples
print(" children_left2", children_left2) # note that negative children values mean this is a leaf node
print(" children_right2", children_right2)
print(" children_default2", children_default2)
print(" features2", features2)
print(" thresholds2", thresholds2.round(3))
print(" values2", values2.round(3))
print("node_sample_weight2", node_sample_weight2)
In [12]:
# define a custom tree ensemble model
tree_dicts = [
    {
        "children_left": children_left1,
        "children_right": children_right1,
        "children_default": children_default1,
        "features": features1,
        "thresholds": thresholds1,
        "values": values1 * orig_model2.learning_rate,
        "node_sample_weight": node_sample_weight1
    },
    {
        "children_left": children_left2,
        "children_right": children_right2,
        "children_default": children_default2,
        "features": features2,
        "thresholds": thresholds2,
        "values": values2 * orig_model2.learning_rate,
        "node_sample_weight": node_sample_weight2
    },
]
model2 = {
    "trees": tree_dicts,
    "base_offset": scipy.special.logit(orig_model2.init_.class_prior_[1]),
    "tree_output": "log_odds",
    "objective": "binary_crossentropy",
    "input_dtype": np.float32,   # the dtype the model expects for the input feature data
    "internal_dtype": np.float64 # the dtype the model uses for values and thresholds
}
In [13]:
# build a background dataset from the 200 samples whose predicted
# probability is closest to a 0.95 cutoff
vs = np.abs(orig_model2.predict_proba(X2)[:,1] - 0.95)
inds = np.argsort(vs)[:200]
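A quick sanity check (our addition) that the selected background samples really do sit near the cutoff:

In [ ]:
# the 200 selected samples should all have predicted probabilities near 0.95
probs = orig_model2.predict_proba(X2.iloc[inds,:])[:,1]
print(probs.min(), probs.max())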
In [14]:
# build an explainer that explains the probability output of the model
explainer2 = shap.TreeExplainer(model2, X2.iloc[inds,:], feature_perturbation="interventional", model_output="probability")
In [15]:
# Make sure that the ingested SHAP model (a TreeEnsemble object) makes the
# same predictions as the original model
assert np.abs(explainer2.model.predict(X2, output="probability") - orig_model2.predict_proba(X2)[:,1]).max() < 1e-4
In [16]:
# make sure the SHAP values sum up to the model output (local accuracy again)
shap_sum = explainer2.expected_value + explainer2.shap_values(X2).sum(1)
assert np.abs(shap_sum - orig_model2.predict_proba(X2)[:,1]).max() < 1e-4
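Because local accuracy holds in probability space, individual predictions can now be explained directly in terms of probabilities; a sketch of a force plot for the first sample (assuming a working matplotlib backend):

In [ ]:
# explain a single prediction of the custom ensemble in probability space
shap_values2 = explainer2.shap_values(X2.iloc[:100,:])
shap.force_plot(explainer2.expected_value, shap_values2[0,:], X2.iloc[0,:], matplotlib=True)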